To Do

from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
import pandas as pd
import json

# Unpack the bundled data archive into the parent directory.
# NOTE(review): presumably this creates the ../data folder read later on — confirm
# against the archive's contents.

import zipfile
with zipfile.ZipFile('../../data.zip', mode='r') as data_archive:
    data_archive.extractall(path='..')

import os

# Read the key from the OPENAI_API_KEY environment variable so a real secret never
# has to be pasted into (and committed with) this file. The original '...'
# placeholder is kept as the fallback when the variable is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY', '...')

# Temp = 0 so that we get clean information without a lot of creativity
chat_model = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, max_tokens=1000)

# Fields the model is asked to return, as (name, description) pairs. Each pair
# becomes a ResponseSchema; together they act as a structured prompt template.
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in (
        ("input_industry", "This is the input_industry from the user"),
        ("standardized_industry", "This is the industry you feel is most closely matched to the users input"),
        ("match_score", "A score 0-100 of how close you think the match is between user input and your match"),
    )
]

# Parser built from the schemas above
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Formatting instructions generated by the parser; shown here and injected into the prompt
format_instructions = output_parser.get_format_instructions()
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema:

```json
{
	"input_industry": string  // This is the input_industry from the user
	"standardized_industry": string  // This is the industry you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```

# Prompt text for the matching task. The placeholders are filled in later:
# {format_instructions} from the output parser, {user_industries} and
# {standardized_industries} when the prompt is formatted.
template = """
You will be given a series of industry names from a user.
Find the best corresponding match on the list of standardized names.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

{format_instructions}

Wrap your final output with closed and open brackets (a list of json objects)

input_industry INPUT:
{user_industries}

STANDARDIZED INDUSTRIES:
{standardized_industries}

YOUR RESPONSE:
"""

# The whole template is sent as a single human message.
human_message_template = HumanMessagePromptTemplate.from_template(template)

# format_instructions is bound up front as a partial variable; the two industry
# lists are supplied by the caller at format time.
prompt = ChatPromptTemplate(
    messages=[human_message_template],
    input_variables=["user_industries", "standardized_industries"],
    partial_variables={"format_instructions": format_instructions},
)

# Load the standardized industry names from the CSV's 'Industry' column and join
# them into one comma-separated string. Any other list of names can be swapped in.
df = pd.read_csv('../data/LinkedInIndustries.csv')
industry_column = df['Industry']
standardized_industries = ", ".join(industry_column.values)
standardized_industries

'Corporate Services, Recreation & Travel, Legal, Wellness & Fitness, Entertainment, Consumer Goods, Design, Arts, Manufacturing, Finance, Health Care, Construction, Nonprofit, Real Estate, Software & IT Services, Hardware & Networking, Agriculture, Education, Public Administration, Transportation & Logistics, Public Safety, Media & Communications, Energy & Mining, Retail'

# Free-form industry names as a user might type them
user_input = "air LineZ, airline, aviation, planes that fly, farming, bread, wifi networks, twitter media agency"

# Fill the remaining prompt placeholders with the user's names and the standardized list
_input = prompt.format_prompt(
    standardized_industries=standardized_industries,
    user_industries=user_input,
)

# Preview what will be sent: message count, type of the first message, a divider,
# and the first message's text, one item per line.
print(
    f"There are {len(_input.messages)} message(s)",
    f"Type: {type(_input.messages[0])}",
    "---------------------------",
    _input.messages[0].content,
    sep="\n",
)

There are 1 message(s)
Type: <class 'langchain.schema.HumanMessage'>
---------------------------

You will be given a series of industry names from a user.
Find the best corresponding match on the list of standardized names.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

The output should be a markdown code snippet formatted in the following schema:

```json
{
	"input_industry": string  // This is the input_industry from the user
	"standardized_industry": string  // This is the industry you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```

Wrap your final output with closed and open brackets (a list of json objects)

input_industry INPUT:
air LineZ, airline, aviation, planes that fly, farming, bread, wifi networks, twitter media agency

STANDARDIZED INDUSTRIES:
Corporate Services, Recreation & Travel, Legal, Wellness & Fitness, Entertainment, Consumer Goods, Design, Arts, Manufacturing, Finance, Health Care, Construction, Nonprofit, Real Estate, Software & IT Services, Hardware & Networking, Agriculture, Education, Public Administration, Transportation & Logistics, Public Safety, Media & Communications, Energy & Mining, Retail

YOUR RESPONSE:

# Convert the formatted prompt to messages and pass them to the chat model
request_messages = _input.to_messages()
output = chat_model(request_messages)

# Show the reply's type, then its text, on separate lines
print(type(output), output.content, sep="\n")

<class 'langchain.schema.AIMessage'>


[
	{
		"input_industry": "air LineZ",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "80"
	},
	{
		"input_industry": "airline",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "90"
	},
	{
		"input_industry": "aviation",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "95"
	},
	{
		"input_industry": "planes that fly",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "85"
	},
	{
		"input_industry": "farming",
		"standarized_industry": "Agriculture",
		"match_score": "90"
	},
	{
		"input_industry": "bread",
		"standarized_industry": "Consumer Goods",
		"match_score": "80"
	},
	{
		"input_industry": "wifi networks",
		"standarized_industry": "Hardware & Networking",
		"match_score": "95"
	},
	{
		"input_industry": "twitter media agency",
		"standarized_industry": "Media & Communications",
		"match_score": "90"
	}
]

# The format instructions ask for a markdown ```json snippet, so the reply may
# arrive wrapped in a code fence. Keep only the JSON payload: take the text after
# the opening ```json marker and cut it off at the closing ``` fence (previously
# the closing fence was left attached, which makes the string invalid JSON).
if "```json" in output.content:
    after_opening_fence = output.content.split("```json", 1)[1]
    json_string = after_opening_fence.split("```", 1)[0].strip()
else:
    # No fence present: the reply is expected to be bare JSON already.
    json_string = output.content.strip()

# Show the extracted payload (the string that is meant to be parsed), not the raw reply.
print(json_string)

[
	{
		"input_industry": "air LineZ",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "80"
	},
	{
		"input_industry": "airline",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "90"
	},
	{
		"input_industry": "aviation",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "95"
	},
	{
		"input_industry": "planes that fly",
		"standarized_industry": "Transportation & Logistics",
		"match_score": "85"
	},
	{
		"input_industry": "farming",
		"standarized_industry": "Agriculture",
		"match_score": "90"
	},
	{
		"input_industry": "bread",
		"standarized_industry": "Consumer Goods",
		"match_score": "80"
	},
	{
		"input_industry": "wifi networks",
		"standarized_industry": "Hardware & Networking",
		"match_score": "95"
	},
	{
		"input_industry": "twitter media agency",
		"standarized_industry": "Media & Communications",
		"match_score": "90"
	}
]

# output_parser.parse(output.content) would ideally be used here, but it does not
# work in all cases, so the JSON is decoded directly.
#
# Decode the extracted json_string rather than the raw output.content: the raw
# reply may be wrapped in a ```json fence, which json.loads rejects. Anything from
# a remaining closing ``` fence onward is dropped before decoding; when no fence
# is present the string is passed through unchanged.
structured_data = json.loads(json_string.split("```", 1)[0])
structured_data

[{'input_industry': 'air LineZ',
  'standarized_industry': 'Transportation & Logistics',
  'match_score': '80'},
 {'input_industry': 'airline',
  'standarized_industry': 'Transportation & Logistics',
  'match_score': '90'},
 {'input_industry': 'aviation',
  'standarized_industry': 'Transportation & Logistics',
  'match_score': '95'},
 {'input_industry': 'planes that fly',
  'standarized_industry': 'Transportation & Logistics',
  'match_score': '85'},
 {'input_industry': 'farming',
  'standarized_industry': 'Agriculture',
  'match_score': '90'},
 {'input_industry': 'bread',
  'standarized_industry': 'Consumer Goods',
  'match_score': '80'},
 {'input_industry': 'wifi networks',
  'standarized_industry': 'Hardware & Networking',
  'match_score': '95'},
 {'input_industry': 'twitter media agency',
  'standarized_industry': 'Media & Communications',
  'match_score': '90'}]

# Tabulate the parsed matches: one row per decoded object, one column per key
pd.DataFrame(data=structured_data)

	input_industry	standarized_industry	match_score
0	air LineZ	Transportation & Logistics	80
1	airline	Transportation & Logistics	90
2	aviation	Transportation & Logistics	95
3	planes that fly	Transportation & Logistics	85
4	farming	Agriculture	90
5	bread	Consumer Goods	80
6	wifi networks	Hardware & Networking	95
7	twitter media agency	Media & Communications	90

To Do

Look at new incoming industries from the user.
Match them against your database of values you’ve already mapped.
For existing ones, save an API call and get the result from the database.
For new ones, batch them together for your LLM to map and return to you.